El objetivo de este proyecto es realizar un análisis exploratorio de datos (EDA) sobre este dataset de calidad de vino, para mostrar qué tipo de preguntas se pueden responder con él.
Este dataset es una muestra de diferentes vinos tintos con sus respectivos componentes químicos: cada fila representa una muestra de vino distinta, y cada columna representa una propiedad específica que podría tener impacto en el sabor y la calidad.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as pyo
# Initialise Plotly's offline notebook mode before any figure is shown.
pyo.init_notebook_mode()

# Read the wine-quality CSV; its fields are separated by ';' rather than ','.
ruta = "../Mis Proyectos EDA/dataSets/winequality-red.csv"
df = pd.read_csv(ruta, sep=';')

# Peek at ten randomly chosen rows.
df.sample(10)
| | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 461 | 8.3 | 0.615 | 0.22 | 2.6 | 0.087 | 6.0 | 19.0 | 0.99820 | 3.26 | 0.61 | 9.3 | 5 |
| 925 | 8.6 | 0.220 | 0.36 | 1.9 | 0.064 | 53.0 | 77.0 | 0.99604 | 3.47 | 0.87 | 11.0 | 7 |
| 665 | 9.4 | 0.590 | 0.14 | 2.0 | 0.084 | 25.0 | 48.0 | 0.99810 | 3.14 | 0.56 | 9.7 | 5 |
| 1331 | 7.8 | 0.870 | 0.26 | 3.8 | 0.107 | 31.0 | 67.0 | 0.99668 | 3.26 | 0.46 | 9.2 | 5 |
| 1541 | 7.4 | 0.250 | 0.29 | 2.2 | 0.054 | 19.0 | 49.0 | 0.99666 | 3.40 | 0.76 | 10.9 | 7 |
| 1166 | 9.9 | 0.540 | 0.26 | 2.0 | 0.111 | 7.0 | 60.0 | 0.99709 | 2.94 | 0.98 | 10.2 | 5 |
| 757 | 8.1 | 0.870 | 0.00 | 2.2 | 0.084 | 10.0 | 31.0 | 0.99656 | 3.25 | 0.50 | 9.8 | 5 |
| 585 | 7.6 | 0.510 | 0.24 | 2.4 | 0.091 | 8.0 | 38.0 | 0.99800 | 3.47 | 0.66 | 9.6 | 6 |
| 1472 | 7.6 | 0.350 | 0.60 | 2.6 | 0.073 | 23.0 | 44.0 | 0.99656 | 3.38 | 0.79 | 11.1 | 6 |
| 1195 | 6.2 | 0.430 | 0.22 | 1.8 | 0.078 | 21.0 | 56.0 | 0.99633 | 3.52 | 0.60 | 9.5 | 6 |
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
# Distinct quality scores, ordered from most to least frequent
# (value_counts sorts by count in descending order by default).
# The original line ended in a dangling '.', which is a syntax error; the
# recorded output is an Index, so `.index` is the intended attribute.
df['quality'].value_counts().index
Index([5, 6, 7, 4, 8, 3], dtype='int64', name='quality')
# Number of wines per quality score, drawn as a seaborn bar plot.
valores = df['quality'].value_counts()
x = valores.index

fig = plt.figure(figsize=(10, 5))
ax = sns.barplot(x=x, y=valores)
ax.set(
    title='Numero de vinos por calidad',
    xlabel='Calidad',
    ylabel='Numero de vinos',
)
plt.show()
# Same counts as a plain Matplotlib bar chart, highlighting the most frequent
# quality score. `valores` comes from value_counts(), which sorts by count in
# descending order, so the first bar is the one to highlight.
fig = plt.figure(figsize=(10, 5))

# Build the colour list from the number of bars instead of hard-coding six
# entries, so the chart still works if the set of quality scores changes.
colores_barras = ['blue'] + ['gray'] * (len(valores) - 1)

ax = plt.bar(x=x, height=valores, color=colores_barras)
plt.title('La calidad que tiene mas muestras de vinos')
plt.xlabel('Calidad')
# Fixed typo: the original called `plt.ylabbl`, which raises AttributeError.
plt.ylabel('Numero de vinos')
plt.show()
La calidad que más se repite en los datos es la 5, seguida de la 6. Si prestamos atención a nuestros datos, podemos observar que quienes califican los vinos tienden a dar una puntuación intermedia.
# Box plot of the quality scores; the title reports their mean rounded to
# two decimals.
x = df['quality']
promedio = x.mean().round(2)

plt.figure(figsize=(10, 5))
ax = sns.boxplot(x=x, color='#A62C2C')
ax.set(
    title=f'Bloxplot de los datos,\n media {promedio}',
    xlabel='Calidad',
)
plt.show()
# One box plot per selected variable, side by side, each on its own scale.
nombre = ['alcohol', 'citric acid', 'pH']
# NOTE(review): `colores` is defined but not used by any plot in this block.
colores = ['#693382', '#336D82', '#5F99AE']

fig, axs = plt.subplots(1, 3, figsize=(15, 5))
for ax, variable in zip(axs, nombre):
    ax.boxplot(x=df[variable])
    ax.set(title=variable, xlabel=variable)

plt.tight_layout()
plt.show()
def boxplotReescalosDeVariables(datos, variables):
    """Draw the rescaled versions of several columns as box plots on one figure.

    Args:
        datos: Data set forwarded to ``reEscalarVariables`` (a DataFrame at
            the call site in this file).
        variables: Sequence of column names; one box is drawn per name, at
            x positions 1..len(variables).

    NOTE(review): relies on ``reEscalarVariables(nombre, datos)``, which is
    not defined in the visible part of this file -- confirm it is in scope.
    """
    plt.figure(figsize=(10, 5))

    # Each variable gets its own box, placed at consecutive x positions.
    for posicion, variable in enumerate(variables, start=1):
        escalada = pd.DataFrame(reEscalarVariables(variable, datos))
        plt.boxplot(x=escalada, positions=[posicion])

    # Label each position with the name of the variable drawn there.
    plt.xticks(ticks=range(1, len(variables) + 1), labels=variables)
    plt.title('Box-plot de variables escaladas')
    plt.show()
# Box plots of the three selected variables after rescaling.
boxplotReescalosDeVariables(df, nombre)

# Build a rescaled copy of every column except 'quality', which is copied
# over unchanged.
# NOTE(review): `reEscalarVariables` is not defined in the visible source;
# confirm it is in scope before running this cell.
df_scale = pd.DataFrame()
for columna in df.columns.drop('quality'):
    df_scale[columna] = reEscalarVariables(nombre=columna, datos=df)
df_scale['quality'] = df['quality']

df_scale.head()
| | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.247788 | 0.397260 | 0.00 | 0.068493 | 0.106845 | 0.140845 | 0.098940 | 0.567548 | 0.606299 | 0.137725 | 0.153846 | 5 |
| 1 | 0.283186 | 0.520548 | 0.00 | 0.116438 | 0.143573 | 0.338028 | 0.215548 | 0.494126 | 0.362205 | 0.209581 | 0.215385 | 5 |
| 2 | 0.283186 | 0.438356 | 0.04 | 0.095890 | 0.133556 | 0.197183 | 0.169611 | 0.508811 | 0.409449 | 0.191617 | 0.215385 | 5 |
| 3 | 0.584071 | 0.109589 | 0.56 | 0.068493 | 0.105175 | 0.225352 | 0.190813 | 0.582232 | 0.330709 | 0.149701 | 0.215385 | 6 |
| 4 | 0.247788 | 0.397260 | 0.00 | 0.068493 | 0.106845 | 0.140845 | 0.098940 | 0.567548 | 0.606299 | 0.137725 | 0.153846 | 5 |
def medias(datos, calidad, columna='quality'):
    """Return the per-column mean of the rows that have a given quality score.

    Args:
        datos: DataFrame containing the grouping column plus the columns to
            average.
        calidad: Value of the grouping column that selects the rows.
        columna: Name of the grouping column. Defaults to ``'quality'``, so
            existing two-argument calls behave as before.

    Returns:
        A Series indexed by the remaining column names, holding their mean
        over the selected rows. The grouping column itself is excluded.
    """
    seleccion = datos.loc[datos[columna] == calidad]
    return seleccion.drop(columns=columna).mean()
# Column means of the wines rated with quality 3 (unscaled data).
medias(datos=df, calidad=3)
fixed acidity 8.360000 volatile acidity 0.884500 citric acid 0.171000 residual sugar 2.635000 chlorides 0.122500 free sulfur dioxide 11.000000 total sulfur dioxide 24.900000 density 0.997464 pH 3.398000 sulphates 0.570000 alcohol 9.955000 dtype: float64
def comparacion_estadisticas(datos, calidades):
    """Overlay the column means of several quality scores on one polar chart.

    Args:
        datos: Data set forwarded to ``medias`` for each quality score.
        calidades: Iterable of quality scores; one filled polar trace is
            added per score and named after it.
    """
    figura = go.Figure()

    for calidad in calidades:
        promedios = medias(datos, calidad)
        traza = go.Scatterpolar(
            r=promedios.values,
            theta=promedios.index,
            name=calidad,
            fill='toself',
            opacity=0.3,
            hoveron='points',
        )
        figura.add_trace(traza)

    # Hide the radial axis and keep the legend so traces can be told apart.
    figura.update_layout(
        showlegend=True,
        polar=dict(radialaxis=dict(visible=False)),
        template='plotly_white',
    )
    figura.show()
# Compare quality profiles on the rescaled data: the two most common scores,
# the two extremes, the two intermediate ones, and finally all of them.
for calidades in ([5, 6], [3, 8], [4, 7], [3, 4, 5, 6, 7, 8]):
    comparacion_estadisticas(df_scale, calidades)